# Description ------------------------------------------------------------------
#   This script combines consumption, price, and weather data for SoCalGas's
#   service area (by zip code).
#   - INPUT: price, consumption, and weather datasets
#   - OUTPUT: zip-by-utility datasets (in DataR/Prices/SoCal)
#   - NEXT SCRIPT: buildPanelCommonZips.R

# Exclusions -------------------------------------------------------------------
#   - I dropped SoCal price data from advice letters 3644, 3680, 3695, 3807,
#     4053, and 4061, as they were updated by letters 3660, 3697, 3697, 3810,
#     4055, and 4069, respectively.
#   - I dropped pre-2008 data (PG&E and prices/allowances), as SoCalGas did not
#     share billing data for pre-2009 bills.
#   - I trimmed the shortest 2.5% and longest 2.5% bills (resulted in keeping
#     bills of length between 28-34 days). I did this to omit the first or last
#     bills for a household.

# Setup ------------------------------------------------------------------------
  # Options
  options(stringsAsFactors = FALSE)
  # Packages
  library(readstata13)
  library(magrittr)
  library(data.table)
  library(stringr)
  library(lubridate)
  library(foreach)
  library(doParallel)
  # Directories of joined bill-weather files
  dir_data       <- "S:/Edward/JoinedBillWeather/"
  dir_data_pge   <- paste0(dir_data, "PGE/")
  dir_data_socal <- paste0(dir_data, "SoCal/")
  dir_data_sdge  <- paste0(dir_data, "SDGE/")
  dir_elasticity <- "S:/Edward/Elasticity/"
  dir_csv        <- paste0(dir_elasticity, "DataCsv/")
  dir_rds        <- paste0(dir_elasticity, "DataR/")
  dir_sample     <- paste0(dir_rds, "Prices/SampledSubsets/SoCal05Percent/")
  # More options
  options(datatable.print.nrows = 10)
  # Set seed
  set.seed(12345)

# Load relevant datasets -------------------------------------------------------
	# Create vector of zip codes for each utility (based upon existing files)
	pge_zips <- dir_data_pge %>% dir() %>%
		gsub(pattern = "[^0-9]", replacement = "")
	socal_zips <- dir_data_socal %>% dir() %>%
		gsub(pattern = "[^0-9]", replacement = "")
	sdge_zips <- dir_data_sdge %>% dir() %>%
		gsub(pattern = "[^0-9]", replacement = "")
  # Create table of zip codes and presence of utilities
  all_zip_dt <- data.table(zip = unique(c(pge_zips, socal_zips, sdge_zips)))
  all_zip_dt[, `:=`(
    pge   = zip %in% pge_zips,
    sdge  = zip %in% sdge_zips,
    socal = zip %in% socal_zips
    )]
  # Save the table as CSV
  write.csv(x = all_zip_dt,
    file = paste0(dir_csv, "utilityZipCodeMap.csv"),
    row.names = F)
  # Vector of the zips common to PG&E and SoCalGas
  common_zips <- all_zip_dt[pge == T & socal == T]$zip
	# Vector of all unique zips
	all_zips <- unique(c(pge_zips, socal_zips, sdge_zips))
	all_zips <- all_zips[order(all_zips)]
  # Load daily allowance dataset
  socal_allowance <- fread(paste0(dir_csv, "socalBaselinesSimple.csv"))
  # Load price dataset
  socal_prices <- fread(paste0(dir_csv, "socalResidentialRatesCleaned.csv"))
  # Load degree day data
  dd_dt <- readRDS(paste0(dir_rds, "degreeDayNARR.rds"))
  # Load Henry Hub spot price data
  hub_dt <- fread(paste0(dir_csv, "dailyHenryHubPricesWithLags.csv"))

# Data work with degree-day dataset --------------------------------------------
  # Convert data to Date format
  dd_dt[, date := as.Date(date)]
  # Convert degree-day dataset to wide
  dd_ca <- dd_dt[region == "ca"][, region := NULL]
  dd_east <- dd_dt[region == "east"][, region := NULL]
  setnames(dd_ca, paste0("ca_", names(dd_ca)))
  setnames(dd_east, paste0("east_", names(dd_east)))
  setnames(dd_ca, "ca_date", "date")
  setnames(dd_east, "east_date", "date")
  dd_dt <- merge(dd_ca, dd_east, by = "date")
  # Create measures of total degree days for last month, 6 months, and year
  dd_dt[, `:=`(
    ca_6month_hdd =
      ca_tot_hdd_lag_month01 + ca_tot_hdd_lag_month02 +
      ca_tot_hdd_lag_month03 + ca_tot_hdd_lag_month04 +
      ca_tot_hdd_lag_month05 + ca_tot_hdd_lag_month06,
    ca_6month_cdd =
      ca_tot_cdd_lag_month01 + ca_tot_cdd_lag_month02 +
      ca_tot_cdd_lag_month03 + ca_tot_cdd_lag_month04 +
      ca_tot_cdd_lag_month05 + ca_tot_cdd_lag_month06,
    east_6month_hdd =
      east_tot_hdd_lag_month01 + east_tot_hdd_lag_month02 +
      east_tot_hdd_lag_month03 + east_tot_hdd_lag_month04 +
      east_tot_hdd_lag_month05 + east_tot_hdd_lag_month06,
    east_6month_cdd =
      east_tot_cdd_lag_month01 + east_tot_cdd_lag_month02 +
      east_tot_cdd_lag_month03 + east_tot_cdd_lag_month04 +
      east_tot_cdd_lag_month05 + east_tot_cdd_lag_month06
    )]
  dd_dt[, `:=`(
    ca_6month_dd = ca_6month_cdd + ca_6month_hdd,
    east_6month_dd = east_6month_cdd + east_6month_hdd
    )]
  dd_dt[, `:=`(
    ca_year_hdd = ca_6month_hdd +
      ca_tot_hdd_lag_month07 + ca_tot_hdd_lag_month08 +
      ca_tot_hdd_lag_month09 + ca_tot_hdd_lag_month10 +
      ca_tot_hdd_lag_month11 + ca_tot_hdd_lag_month12,
    ca_year_cdd = ca_6month_cdd +
      ca_tot_cdd_lag_month07 + ca_tot_cdd_lag_month08 +
      ca_tot_cdd_lag_month09 + ca_tot_cdd_lag_month10 +
      ca_tot_cdd_lag_month11 + ca_tot_cdd_lag_month12,
    east_year_hdd = east_6month_hdd +
      east_tot_hdd_lag_month07 + east_tot_hdd_lag_month08 +
      east_tot_hdd_lag_month09 + east_tot_hdd_lag_month10 +
      east_tot_hdd_lag_month11 + east_tot_hdd_lag_month12,
    east_year_cdd = east_6month_cdd +
      east_tot_cdd_lag_month07 + east_tot_cdd_lag_month08 +
      east_tot_cdd_lag_month09 + east_tot_cdd_lag_month10 +
      east_tot_cdd_lag_month11 + east_tot_cdd_lag_month12
    )]
  dd_dt[, `:=`(
    ca_year_dd = ca_year_cdd + ca_year_hdd,
    east_year_dd = east_year_cdd + east_year_hdd
    )]
  dd_dt[, `:=`(
    ca_1month_dd = ca_tot_hdd_lag_month01 + ca_tot_cdd_lag_month01,
    east_1month_dd = east_tot_hdd_lag_month01 + east_tot_cdd_lag_month01
    )]
  # Change names
  setnames(dd_dt,
    old = c("ca_tot_hdd_lag_month01", "ca_tot_cdd_lag_month01",
      "east_tot_hdd_lag_month01", "east_tot_cdd_lag_month01"),
    new = c("ca_1month_hdd", "ca_1month_cdd",
      "east_1month_hdd", "east_1month_cdd"))
  # Keep subset of variables
  dd_dt <- dd_dt[, .(date,
    ca_1month_dd, ca_1month_hdd, ca_1month_cdd,
    east_1month_dd, east_1month_hdd, east_1month_cdd,
    ca_6month_dd, ca_6month_hdd, ca_6month_cdd,
    east_6month_dd, east_6month_hdd, east_6month_cdd,
    ca_year_dd, ca_year_hdd, ca_year_cdd,
    east_year_dd, east_year_hdd, east_year_cdd
    )]

# Fix formatting in price/allowance data ---------------------------------------
  # Convert characters to dates where appropriate
  socal_prices[, date_eff := date_eff %>% mdy() %>% as.Date()]
  # Subset price data to 2009 through 2016
  # PGE bill data span 2003-2014; SoCal bill data span 2010-2015
  socal_prices <- socal_prices[between(year(date_eff), 2009, 2016)]
  # Remove dollar signs from prices; convert to numeric
  # (left over from original XLS files)
  tmp <- socal_prices[, 3:ncol(socal_prices), with = F]
  for(j in 1:ncol(tmp)) {
    set(x = tmp, j = j, value = gsub("\\$", "", tmp[[j]]) %>% as.numeric())
  }
  socal_prices <- cbind(socal_prices[, 1:2, with = F], tmp); rm(tmp)
  # Drop eff_start, eff_stop, season, and month
  socal_allowance[, season := NULL]
  # Change name of "base_" to "allow_"
  setnames(socal_allowance, "allowance", "allow_ind")
  # Drop SoCal price data based upon advice letters 4053 and 4061
  socal_prices <- socal_prices[
    !(advice_no %in% c(3644, 3680, 3695, 3807, 4053, 4061))]
  # Drop advice number from the price data
  socal_prices[, advice_no := NULL]
  # Change date_eff to date_eff_price
  setnames(socal_prices, "date_eff", "date_eff_price")
  # Change "climate_zone" to "climate" (SoCal)
  setnames(socal_allowance, "climate_zone", "climate")
  # Change name of non-care PPPS variable
  setnames(socal_prices, "ppps_non_care", "ppps")
  # Drop variables that I do not need
  socal_prices[, c("discount_submeter", "discount_care_submeter") := NULL]
  # Merge degree-data data with price data
  socal_prices %<>% merge(y = dd_dt, by.x = "date_eff_price", by.y = "date",
    all.x = T, all.y = F, sort = F)
  # Create datasets of dates on which price regimes become effective
  socal_price_dates <- socal_prices[, .(date_eff_price)]
  # Add two leads of effective price date
  setorder(socal_price_dates, date_eff_price)
  socal_price_dates[, c(paste0("date_eff_price_", 1:2, "lead")) :=
    shift(date_eff_price, 1L:2L, NA, "lead")]

# Join Henry Hub spot price data with utilities' price data --------------------
  # Convert spot price dates to Date format
  hub_dt[, date := date %>% ymd() %>% as.Date()]
  # Drop the day's spot price (we're keeping the weekly averages)
  hub_dt[, spot_price := NULL]
  # Merge the datasets
  socal_prices %<>% merge(y = hub_dt, by.x = "date_eff_price", by.y = "date",
    all.x = T, all.y = F, sort = F)

# Convert zip code files to R format -------------------------------------------
#  # Start the cluster
#  cl <- makeCluster(6)
#  registerDoParallel(cl)
#  # For each zip code: open the relevant PGE/SoCal/SDGE files; save as .rds
#	foreach(the_zip = all_zips,
#    .packages = c("readstata13", "data.table")) %dopar% {
#      # If there is a PGE file, save it as .rds
#      if(the_zip %in% pge_zips) saveRDS(object =
#        read.dta13(paste0(dir_data_pge, "joined_pge_", the_zip, ".dta")),
#        file = paste0(dir_rds, "JoinedWeather/joined_pge_", the_zip, ".rds"))
#      # If there is a SoCal file, save it as .rds
#      if(the_zip %in% socal_zips) saveRDS(object =
#        read.dta13(paste0(dir_data_socal, "joined_socal_", the_zip, ".dta")),
#        file = paste0(dir_rds, "JoinedWeather/joined_socal_", the_zip, ".rds"))
#      # If there is a SDGE file, save it as .rds
#      if(the_zip %in% sdge_zips) saveRDS(object =
#        read.dta13(paste0(dir_data_sdge, "joined_sdge_", the_zip, ".dta")),
#        file = paste0(dir_rds, "JoinedWeather/joined_sdge_", the_zip, ".rds"))
#    }
#  # Kill cluster
#  stopCluster(cl)

# Open billing files and attach price data -------------------------------------
  # Start the cluster
  cl <- makeCluster(12)
  registerDoParallel(cl)
  # Loop over the zip codes
  foreach(the_zip = socal_zips,
#  foreach(the_zip = common_zips,
    .packages = c("data.table", "magrittr", "lubridate", "stringr")) %dopar% {
    #the_zip <- 93224
    #the_zip <- 93311
    # Load the zip code's SoCal utility file
    socal_dt <- readRDS(paste0(dir_rds, "JoinedWeather/joined_socal_",
      the_zip, ".rds")) %>% data.table()
    # Grab temperature bin columns
    socal_weather <- socal_dt[,
      c("begin_date", "end_date", "hh_id", paste0("t_p_", 1:14), "days"),
      with = F]
    # Convert temperature bins from 30-day equivalent to actual numbers of days
# HACK: We will have different numbers of days for weather and bill, since
# my definition of bill length change below (relative to the definition that
# I used in the weather calculations).
    socal_weather[, `:=`(
      t_p_1  = t_p_1  * days / 30,
      t_p_2  = t_p_2  * days / 30,
      t_p_3  = t_p_3  * days / 30,
      t_p_4  = t_p_4  * days / 30,
      t_p_5  = t_p_5  * days / 30,
      t_p_6  = t_p_6  * days / 30,
      t_p_7  = t_p_7  * days / 30,
      t_p_8  = t_p_8  * days / 30,
      t_p_9  = t_p_9  * days / 30,
      t_p_10 = t_p_10 * days / 30,
      t_p_11 = t_p_11 * days / 30,
      t_p_12 = t_p_12 * days / 30,
      t_p_13 = t_p_13 * days / 30,
      t_p_14 = t_p_14 * days / 30
      )][, days := NULL]
    # Drop temperature variables
    socal_dt <- socal_dt[, 1:14, with = F]
    # Drop other unwanted variables
    socal_dt[, c("meter_id", "acct_id", "zip5",
      "year") := NULL]
    # Drop customers that are not standard residential or CARE residential
    socal_dt <- socal_dt[rate_sch %in% c("GR", "GRL")]
    # Recalculate bill lengths
		# Example: 12/1-12/15 will be 15 days, rather than 16 days
    socal_dt[, days := (end_date - begin_date) %>% as.numeric()]
		# Create month and year variables
		socal_dt[, `:=`(
			month = str_pad(month(begin_date), 2, "left", 0),
			year = year(begin_date))]
		# Create year-month variable
		socal_dt[, ym :=
			paste0(year, month)]

    # Rolling merge price dates onto utility dataset ---------------------------
    # Add copies of columns used in the rolling merge
    socal_dt[, begin_date_copy := begin_date]
    socal_price_dates[, date_eff_price_copy := date_eff_price]
    # Set keys for the rolling merge
    setkey(socal_dt, begin_date_copy, hh_id)
    setkey(socal_price_dates, date_eff_price_copy)
    # The rolling merge: merge using the begin date of the bill and the price
    # regime that precedes the begin date
    socal_dt <- socal_price_dates[socal_dt, roll = Inf]
    socal_dt[, date_eff_price_copy := NULL]
    # Change names of date_eff and its leads
    setnames(socal_dt, paste0("date_eff_price", c("", "_1lead", "_2lead")),
      paste0("date_price", 1:3))
    # Add the month for each of the date_prices
    socal_dt[, `:=`(
      month1 = month(date_price1),
      month2 = month(date_price2),
      month3 = month(date_price3)
      )]

    # Merge billing data with price and allowance data -------------------------
    # For each of the three "date_price"s
    #   (1) Change names in socal_allowance
    #   (2) Change names in socal_prices
    #   (3) Merge the billing and allowance data
    #   (4) Merge the billing and price data
    # SoCal: date_price1
    setnames(socal_allowance, paste0(names(socal_allowance), 1))
    setnames(socal_prices, paste0(names(socal_prices), 1))
    socal_dt %<>% merge(y = socal_allowance,
      by.x = c("climate", "month1"),
      by.y = c("climate1", "month1"),
      all.x = T, all.y = F)
    socal_dt %<>% merge(y = socal_prices,
      by.x = "date_price1", by.y = "date_eff_price1", all.x = T, all.y = F)
    # SoCal: date_price2
    setnames(socal_allowance, gsub("[1]$", "2", names(socal_allowance)))
    setnames(socal_prices, gsub("[1]$", "2", names(socal_prices)))
    socal_dt %<>% merge(y = socal_allowance,
      by.x = c("climate", "month2"),
      by.y = c("climate2", "month2"),
      all.x = T, all.y = F)
    socal_dt %<>% merge(y = socal_prices,
      by.x = "date_price2",
      by.y = "date_eff_price2",
      all.x = T, all.y = F)
    # SoCal: date_price3
    setnames(socal_allowance, gsub("[2]$", "3", names(socal_allowance)))
    setnames(socal_prices, gsub("[2]$", "3", names(socal_prices)))
    socal_dt %<>% merge(y = socal_allowance,
      by.x = c("climate", "month3"),
      by.y = c("climate3", "month3"),
      all.x = T, all.y = F)
    socal_dt %<>% merge(y = socal_prices,
      by.x = "date_price3",
      by.y = "date_eff_price3",
      all.x = T, all.y = F)
    # Return SoCal column names to unnumbered
    setnames(socal_allowance, gsub("[3]$", "", names(socal_allowance)))
    setnames(socal_prices, gsub("[3]$", "", names(socal_prices)))

    # Determine days spent in each price regime --------------------------------
    # Alphabetize columns
    setcolorder(socal_dt, order(names(socal_dt)))
    # Calculate the number of days within each of the unique periods.
    # NOTE: The bill can end in any of the three regimes, which affects how the
    # we calculate the number of days in the regimes.
    # First: Bills that end before the second price regime begins
    socal_dt[end_date <= date_price2, `:=`(
      days1 = (end_date - begin_date) %>% as.numeric(),
      days2 = 0,
      days3 = 0
      )]
    # Second: Bills that end during the second price regime
    socal_dt[(end_date > date_price2) & (end_date <= date_price3), `:=`(
      days1 = (date_price2 - begin_date) %>% as.numeric(),
      days2 = (end_date - date_price2) %>% as.numeric(),
      days3 = 0
      )]
    # Third: Bills that end during the third price regime
    socal_dt[end_date > date_price3, `:=`(
      days1 = (date_price2 - begin_date) %>% as.numeric(),
      days2 = (date_price3 - date_price2) %>% as.numeric(),
      days3 = (end_date - date_price3) %>% as.numeric()
      )]
    # Alphabetize column order
    setcolorder(socal_dt, order(names(socal_dt)))
    # Order rows
    setorder(socal_dt, hh_id, begin_date)

    # Calculate prices and fees ------------------------------------------------
    # Calculate baseline allowance for the bill
    #NOTE: I assume all bills are for individual households (not multifamily)
    socal_dt[, allowance :=
      (days1 * allow_ind1) +
      (days2 * allow_ind2) +
      (days3 * allow_ind3)]
    # Determine whether the household exceeded their baseline allowance
    socal_dt[, exceeded := thm > allowance]
    # By how much did the household exceed their baseline?
    socal_dt[, thm_excess := thm - allowance]
    socal_dt[exceeded == F, thm_excess := 0]
    # Add the number of therms on which the household paid baseline rate
    socal_dt[exceeded == T, thm_base := allowance]
    socal_dt[exceeded == F, thm_base := thm]
    # Fill in the missing SoCal CARE values (at some point, SoCal stopped
    # reporting CARE values, as they were/are 80% of the full value).
    socal_dt[is.na(charge_care_base1),
      charge_care_base1 := 0.8 * charge_base1]
    socal_dt[is.na(charge_care_base2),
      charge_care_base2 := 0.8 * charge_base2]
    socal_dt[is.na(charge_care_base3),
      charge_care_base3 := 0.8 * charge_base3]
    socal_dt[is.na(charge_care_excess1),
      charge_care_excess1 := 0.8 * charge_excess1]
    socal_dt[is.na(charge_care_excess2),
      charge_care_excess2 := 0.8 * charge_excess2]
    socal_dt[is.na(charge_care_excess3),
      charge_care_excess3 := 0.8 * charge_excess3]
    socal_dt[is.na(charge_meter_care1),
      charge_meter_care1 := 0.8 * charge_meter1]
    socal_dt[is.na(charge_meter_care2),
      charge_meter_care2 := 0.8 * charge_meter2]
    socal_dt[is.na(charge_meter_care3),
      charge_meter_care3 := 0.8 * charge_meter3]
    socal_dt[is.na(charge_customer_care1),
      charge_customer_care1 := 0.8 * charge_customer1]
    socal_dt[is.na(charge_customer_care2),
      charge_customer_care2 := 0.8 * charge_customer2]
    socal_dt[is.na(charge_customer_care3),
      charge_customer_care3 := 0.8 * charge_customer3]
    # Calculate weighted averages of prices
		# (where weights are based upon days in regimes)
    # NOTE: For SoCal, I need to include add the prices, the PPPS, and the
    # customer charge. The PPPS is assessed on every therm. The customer
    # charge is assessed per day per meter (and does not change in our time
		# period).
    socal_dt[, `:=`(
      charge_base = (
        days1 * charge_base1 +
        days2 * charge_base2 +
        days3 * charge_base3
        ) / days,
      charge_care_base = (
        days1 * charge_care_base1 +
        days2 * charge_care_base2 +
        days3 * charge_care_base3
        ) / days,
      charge_excess = (
        days1 * charge_excess1 +
        days2 * charge_excess2 +
        days3 * charge_excess3
        ) / days,
      charge_care_excess = (
        days1 * charge_care_excess1 +
        days2 * charge_care_excess2 +
        days3 * charge_care_excess3
        ) / days,
      charge_customer = (
        days1 * charge_customer1 +
        days2 * charge_customer2 +
        days3 * charge_customer3
        ) / days,
      charge_care_customer = (
        days1 * charge_customer_care1 +
        days2 * charge_customer_care2 +
        days3 * charge_customer_care3
        ) / days,
      ppps = (
        days1 * ppps1 +
        days2 * ppps2 +
        days3 * ppps3) /
         days,
      ppps_care = (
        days1 * ppps_care1 +
        days2 * ppps_care2 +
        days3 * ppps_care3
        ) / days
      )]

    # Weight degree days and spot prices ---------------------------------------
    # Calculate weighted averages of degree-day/temperature tallies
    socal_dt[, `:=`(
      ca_year_dd  = (days1 * ca_year_dd1 +
        days2 * ca_year_dd2 + days3 * ca_year_dd3) / days,
      ca_year_cdd = (days1 * ca_year_cdd1 +
        days2 * ca_year_cdd2 + days3 * ca_year_cdd3) / days,
      ca_year_hdd = (days1 * ca_year_hdd1 +
        days2 * ca_year_hdd2 + days3 * ca_year_hdd3) / days,
      east_year_dd  = (days1 * east_year_dd1 +
        days2 * east_year_dd2 + days3 * east_year_dd3) / days,
      east_year_cdd = (days1 * east_year_cdd1 +
        days2 * east_year_cdd2 + days3 * east_year_cdd3) / days,
      east_year_hdd = (days1 * east_year_hdd1 +
        days2 * east_year_hdd2 + days3 * east_year_hdd3) / days
      )]
    # Calculate period-length-weighed averages of weekly spot price variables
    socal_dt[, `:=`(
      spot_price_1week = (days1 * spot_price_lag_1_71 +
        days2 * spot_price_lag_1_72 + days3 * spot_price_lag_1_73) / days,
      spot_price_2week = (days1 * spot_price_lag_8_141 +
        days2 * spot_price_lag_8_142 + days3 * spot_price_lag_8_143) / days,
      spot_price_3week = (days1 * spot_price_lag_15_211 +
        days2 * spot_price_lag_15_212 + days3 * spot_price_lag_15_213) / days,
      spot_price_4week = (days1 * spot_price_lag_22_281 +
        days2 * spot_price_lag_22_282 + days3 * spot_price_lag_22_283) / days,
      spot_price_5week = (days1 * spot_price_lag_29_351 +
        days2 * spot_price_lag_29_352 + days3 * spot_price_lag_29_353) / days,
      spot_price_6week = (days1 * spot_price_lag_36_421 +
        days2 * spot_price_lag_36_422 + days3 * spot_price_lag_36_423) / days,
      NULL = NULL)]
      # Keep the degree day and spot price variables from the first price and
      # allowance regimes (the regimes in place when the bill began). I use
      # these values in an alternate empirical strategy.
      socal_dt[, `:=`(
        charge_base_init        = charge_base1,
        charge_care_base_init   = charge_care_base1,
        charge_excess_init      = charge_excess1,
        charge_care_excess_init = charge_care_excess1,
        ca_year_dd_init         = ca_year_dd1,
        ca_year_cdd_init        = ca_year_cdd1,
        ca_year_hdd_init        = ca_year_hdd1,
        east_year_dd_init       = east_year_dd1,
        east_year_cdd_init      = east_year_cdd1,
        east_year_hdd_init      = east_year_hdd1,
        spot_price_1week_init   = spot_price_lag_1_71,
        spot_price_2week_init   = spot_price_lag_8_141,
        spot_price_3week_init   = spot_price_lag_15_211,
        spot_price_4week_init   = spot_price_lag_22_281,
        spot_price_5week_init   = spot_price_lag_29_351,
        spot_price_6week_init   = spot_price_lag_36_421,
        NULL = NULL)]

    # Calculate bills ----------------------------------------------------------
    # NOTE: It appears as though the PG&E "rev" variable excludes PPPS-related
    #  charges. SoCal's data has the actual total bill.
    # Drop period-specific variables (all end with 1, 2, or 3)
    socal_dt[,
			grep(pattern = "[1-3]$", x = names(socal_dt), value = T) := NULL]
    # Calculate bill revenue using prices and quantities
    socal_dt[, `:=`(
      rev_noncare = thm_base * charge_base +
				thm_excess * charge_excess +
        days * charge_customer + thm * ppps,
      rev_care = thm_base * charge_care_base +
				thm_excess * charge_care_excess +
        days * charge_care_customer + thm * ppps_care
      )]
    # Calculate the error for each revenue calculation
    socal_dt[, `:=`(
      rev_noncare_error = rev - rev_noncare,
      rev_care_error = rev - rev_care
      )]
    # Find the minimum absolute error for each bill
    socal_dt[, row_no := 1:.N][,
      rev_error_min := min(abs(rev_noncare_error), abs(rev_care_error)),
      by = row_no][, row_no := NULL]
    # Determine if the HH is a CARE HH using nearest bill estimate
    # NOTE: The rate schedule GRL represents CARE households for SoCal. I
    # am able to predict all of the GRL households; however, I also classify
    # many GR households as CARE. I also need to find PG&E CARE households.
    socal_dt[, care := rev_error_min == abs(rev_care_error)]
    # Calculate the percentage of observations that I classify as CARE
    socal_dt[, care_pct := sum(care == T) / .N, by = hh_id]
    # I'm ignoring the next statement for now:
      # If >= 75% or more of a household's observations are "CARE", then
      # identify the household as CARE; if <= 25% of a household's obsevations
      # are non-CARE, then classify the household as non-CARE
    # socal_dt[care_pct >= 0.75, care := T]
    # socal_dt[care_pct <= 0.25, care := F]
# HACK: Should I fix the numbers to provide full coverage?
    # Use the CARE classification to determine the predicted rev and errors
    socal_dt[care == T, rev_hat := rev_care]
    socal_dt[care == F, rev_hat := rev_noncare]
    socal_dt[, rev_error := rev - rev_hat]
    # Delete unwanted columns
    socal_dt[, c("rev_care", "rev_care_error", "rev_noncare",
      "rev_noncare_error", "rev_error_min") := NULL]
    # Find percentage error
    socal_dt[, pct_error := rev_error / rev]
    # Flag bills where the percentage error is more than 5 percent
    socal_dt[, bill_flag := abs(pct_error) > 0.05]
    # Extend a lag of the flag (to know if the previous bill was flagged)
    setorder(socal_dt, hh_id, begin_date)
    socal_dt[,
      bill_flag_lag1 := shift(bill_flag, n = 1L, fill = NA, type = "lag"),
      by = hh_id]
    # Count the number of flags within a household
    socal_dt[, n_flags := sum(bill_flag), hh_id]
    # Calculate the percentage of flags within a household
    socal_dt[, pct_flags := n_flags / .N, hh_id]

    # Calculate prices ---------------------------------------------------------
    # Determine the relevant PPPS rate
    socal_dt[care == T, ppps_charge := ppps_care]
    socal_dt[care == F, ppps_charge := ppps]
    # Create variables for the baseline and excess prices
    socal_dt[care == T, price_base := charge_care_base]
    socal_dt[care == F, price_base := charge_base]
    socal_dt[care == T, price_excess := charge_care_excess]
    socal_dt[care == F, price_excess := charge_excess]
    # Likewise for "initial" prices
    socal_dt[care == T, price_base_init := charge_care_base_init]
    socal_dt[care == F, price_base_init := charge_base_init]
    socal_dt[care == T, price_excess_init := charge_care_excess_init]
    socal_dt[care == F, price_excess_init := charge_excess_init]
    # Determine marginal prices
    socal_dt[exceeded == F, price_mrg := price_base]
    socal_dt[exceeded == T, price_mrg := price_excess]
    # Calculate average marginal price (therm-weighted-average of mrg. prices)
    socal_dt[, price_avg_mrg :=
      ((thm_base * price_base) + (thm_excess * price_excess)) / thm]
    # Update preceding calculations with the PPPS volumetric charges:
    # Baseline and excess prices
    socal_dt[, price_base_ppps := price_base + ppps_charge]
    socal_dt[, price_excess_ppps := price_excess + ppps_charge]
    # "Initial" prices
    socal_dt[, price_base_init_ppps := price_base_init + ppps_charge]
    socal_dt[, price_excess_init_ppps := price_excess_init + ppps_charge]
    # Marginal prices
    socal_dt[, price_mrg_ppps := price_mrg + ppps_charge]
    # Average marginal price
    socal_dt[, price_avg_mrg_ppps := price_avg_mrg + ppps_charge]

    # Calculate total bill -----------------------------------------------------
    # If ppps_charge is missing, make it zero (started halfway through data)
    socal_dt[is.na(ppps_charge), ppps_charge := 0]
    # Calculate total bill
    socal_dt[, total_bill :=
      thm_base * price_base + thm_excess * price_excess + thm * ppps_charge]
    # Add customer charges
    socal_dt[care == T,
      total_bill := total_bill + charge_care_customer]
    socal_dt[care == F,
      total_bill := total_bill + charge_customer]
    # Calculate average prices: total bill divided by therms, where total
    # bill includes PPPS and customer charges.
    socal_dt[, price_avg := total_bill / thm]

    # Add misc. variables ------------------------------------------------------
		# Add utility (helpful upon full merge)
		socal_dt[, utility := "socal"]
		# Add utility and zip code to the household ID (and pad exiting ID)
		socal_dt[, hh_id :=
			paste0("socal", the_zip, str_pad(hh_id, 8, "left", 0))]
    socal_weather[, hh_id :=
			paste0("socal", the_zip, str_pad(hh_id, 8, "left", 0))]
    # Fix names of temperature bins (pad bin number with zeros)
    old_bins <- socal_weather %>% names() %>% grep("t_p_", ., value = T)
    new_bins <- old_bins %>% gsub("t_p_", "", .) %>% str_pad(2, "left", 0)
    new_bins %<>% paste0("t_p_", .)
    setnames(socal_weather, old = old_bins, new = new_bins)
    # Calculate heating degree days within bill
    # HACK: These intra-bill HDDs are approximate
    # NOTE: See Eds_excellent_adventure.do for explanation of bins
    socal_weather[, bill_hdd :=
      t_p_01 * (65 - 24) +
      t_p_02 * (65 - (35 - 24) / 2) +
      t_p_03 * (65 - (40 - 35) / 2) +
      t_p_04 * (65 - (47 - 40) / 2) +
      t_p_05 * (65 - (51 - 47) / 2) +
      t_p_06 * (65 - (55 - 51) / 2) +
      t_p_07 * (65 - (59 - 55) / 2) +
      t_p_08 * (65 - (63 - 59) / 2)]
    # Merge weather data back onto the billing data
    socal_dt %<>% merge(., y = socal_weather,
      by = c("hh_id", "begin_date", "end_date"),
      all.x = T, all.y = F, sort = F)
		# Create household-month FEs
		socal_dt[, hh_month := paste0(hh_id, month)]
    # Create zip-9 variable
	  #NOTE: we are missing many of the 4-digit extensions of the zip;
	  # for now, they get their own 4-digit extension (e.g. 94702NA)
	  socal_dt[, zip9 := paste0(zip, zip_plus4)]
    # Create therms per day variable
    socal_dt[, thm_day := thm / days]

    # Add lags (and leads) for simulated instruments ---------------------------
		# Add thm lags for simulated instruments (10 and 14 periods)
		setorder(socal_dt, hh_id, begin_date)
		socal_dt[, paste0("thm_lag", 10:14) :=
			shift(thm, n = c(10L:14L), fill = NA, type = "lag", give.names = F),
			by = hh_id]
		# Add days lags for simulated instruments (10 and 14 periods)
		setorder(socal_dt, hh_id, begin_date)
		socal_dt[, paste0("days_lag", 10:14) :=
			shift(days, n = c(10L:14L), fill = NA, type = "lag", give.names = F),
			by = hh_id]

    # Construct simulated instruments ------------------------------------------
    # Determine which lagged periods the HH exceeded the current allowance
    socal_dt[, `:=`(
      exceeded_sim_10 = (thm_lag10 / days_lag10) > (allowance / days),
      exceeded_sim_11 = (thm_lag11 / days_lag11) > (allowance / days),
      exceeded_sim_12 = (thm_lag12 / days_lag12) > (allowance / days),
      exceeded_sim_13 = (thm_lag13 / days_lag13) > (allowance / days),
      exceeded_sim_14 = (thm_lag14 / days_lag14) > (allowance / days)
      )]
    # Pct the HH exceeded their allowance using lags 10-14 and 11-13
    socal_dt[, `:=`(
      exceeded_sim_10_14 = (
        ifelse(is.na(exceeded_sim_10), 0, exceeded_sim_10) +
        ifelse(is.na(exceeded_sim_11), 0, exceeded_sim_11) +
        ifelse(is.na(exceeded_sim_12), 0, exceeded_sim_12) +
        ifelse(is.na(exceeded_sim_13), 0, exceeded_sim_13) +
        ifelse(is.na(exceeded_sim_14), 0, exceeded_sim_14)
      ) / (
        (!is.na(exceeded_sim_10)) +
        (!is.na(exceeded_sim_11)) +
        (!is.na(exceeded_sim_12)) +
        (!is.na(exceeded_sim_13)) +
        (!is.na(exceeded_sim_14))
      ),
      exceeded_sim_11_13 = (
        ifelse(is.na(exceeded_sim_11), 0, exceeded_sim_11) +
        ifelse(is.na(exceeded_sim_12), 0, exceeded_sim_12) +
        ifelse(is.na(exceeded_sim_13), 0, exceeded_sim_13)
      ) / (
        (!is.na(exceeded_sim_11)) +
        (!is.na(exceeded_sim_12)) +
        (!is.na(exceeded_sim_13))
      ) )]
    # Create simulated instrument variables (10-14 and 11-13)
    socal_dt[, `:=`(
      mrg_sim_iv_10_14 =
        (exceeded_sim_10_14 <= 0.5) * price_base +
        (exceeded_sim_10_14 > 0.5) * price_excess,
      mrg_sim_iv_11_13 =
        (exceeded_sim_11_13 <= 0.5) * price_base +
        (exceeded_sim_11_13 > 0.5) * price_excess
      )]

    # Add more lags and leads --------------------------------------------------
    # Temperature bin names
    bin_names <- paste0("t_p_", str_pad(1:14, 2, "left", 0))
    # Names of variables to lag/lead
    var_names <- c(
      # Non-price variables
      "allowance", "thm", "thm_day",
      "days", "month", bin_names,
      "bill_hdd",
      "east_year_hdd", "ca_year_hdd",
      "east_year_cdd", "ca_year_cdd",
      "east_year_dd", "ca_year_dd",
      "total_bill", "bill_flag", "care",
      # Price variables
      "price_avg", "price_avg_mrg",
      "price_base", "price_base_ppps", "price_base_init",
      "price_excess", "price_excess_ppps", "price_excess_init",
      "ppps_charge",
      "price_mrg", "price_mrg_ppps",
      "mrg_sim_iv_10_14", "mrg_sim_iv_11_13",
      "spot_price_1week","spot_price_1week_init",
      "spot_price_2week", "spot_price_2week_init",
      "spot_price_3week", "spot_price_3week_init")
    # Define the number of lags and leads
    the_lags <- c(1:15)
    the_leads <- 1:3
    # Names for new lag/lead variables
    lag_names <- paste0(rep(var_names, each = length(the_lags)),
      "_lag", the_lags)
    lead_names <- paste0(rep(var_names, each = length(the_leads)),
      "_lead", the_leads)
    # Drop variables whose names overlap with the list of lags/leads
    socal_dt[, intersect(c(lag_names, lead_names), names(socal_dt)) := NULL]
    # Order the rows by household and bills' start dates
    setorder(socal_dt, "hh_id", "begin_date")
    # Create lags
    socal_dt[, (lag_names) := shift(.SD, the_lags, NA, "lag"),
      by = hh_id, .SDcols = var_names]
    # Create leads
    socal_dt[, (lead_names) := shift(.SD, the_leads, NA, "lead"),
      by = hh_id, .SDcols = var_names]

    # Save full dataset --------------------------------------------------------
		# Order columns alphabetically
		setcolorder(socal_dt, order(names(socal_dt)))
		# Save
		saveRDS(object = socal_dt,
			file = paste0(dir_rds, "Prices/SoCal/prices_socal_", the_zip, ".rds"))

    # Create and save subset ---------------------------------------------------
    # Define the percentage of observations that we wish to keep
    pct <- 0.05
    # Compose set of unique household IDs with their counts; keep households
    # with at least 12 observations
    id_dt <- socal_dt[, .N, by = hh_id][N > 12]
    # Sample 'pct' percent of the IDs (rounding up to nearest integer)
    ids_sampled <- sample(
      x = id_dt[,hh_id],
      size = ceiling(nrow(id_dt) * pct))
    # Grab the subset of sampled households from bill_dt
    sample_dt <- socal_dt[hh_id %in% ids_sampled]
    # Save the sampled dataset
    saveRDS(object = sample_dt,
      file = paste0(dir_sample, "socal", the_zip, ".rds"))

    # The end ------------------------------------------------------------------
	}
  # Kill cluster
  stopCluster(cl)
  # Clean up
  gc()

# Data notes -------------------------------------------------------------------
#   - In the 2009-2015 subset of PG&E and SoCalGas, all tariff and allowance
#     changes occur on the first of the month.
#   - The prefix "H" for PG&E rate schedules implies smart metering.
#   - Other rate schedule codings (PG&E)
#       * G-1 – Residential Service
#       * GM – Master-Metered Multifamily Service
#       * GS – Multifamily Service
#       * GT – Mobilehome Park Service
#       * G1-NGV – Residential Natural Gas Service for Compression
#         on Customers' Premises
#       * GL-1 – Residential CARE Program Service
#       * GML – Master-Metered Multifamily CARE Program Service
#       * GSL – Multi family CARE Program Service
#       * GTL – Mobilehome Park CARE Program Service
#       * GL1-NGV – Residential CARE Program Natural Gas Service for
#         Compression on Customers' Premises.
#     source: http://www.pge.com/nots/rates/tariffs/tm2/pdf/GAS_3281-G.pdf
#   - SoCalGas uses GR for standard consumer rates and GRL for CARE
#     source: https://www.socalgas.com/regulatory/tariffs/tm2/pdf/3501.pdf
